# -*- coding: utf-8 -*-
import jieba
import math
import numpy as np
# Prompt for the two documents to compare.
a = input("请输入第一篇文章的绝对路径")
b = input("请输入第二篇文章的绝对路径")
# Number of documents in the "corpus"; used by the IDF computation.
text_sum = 2

# Open the two source documents and the intermediate files that receive
# their tokenised text.
# encoding='utf-8' on the inputs: the documents are Chinese text, and the
# platform's locale default (e.g. GBK on Chinese Windows) may not decode
# them correctly.
inputs1 = open(a, 'r', encoding='utf-8')
outputs1 = open("out1.txt", 'w')

inputs2 = open(b, 'r', encoding='utf-8')
outputs2 = open("out2.txt", 'w')

# Chinese word segmentation for a single line of text.
def seg_depart(sentence):
    """Tokenise one line of Chinese text with jieba.

    Returns the tokens joined by single spaces, with a trailing space
    after each token; an empty/blank line yields ''.

    NOTE(review): the original comment mentioned stopword removal, but
    only literal tab tokens are dropped — no stopword list is applied.
    Confirm whether that was intended.
    """
    tokens = jieba.cut(sentence.strip())
    kept = (tok for tok in tokens if tok != '\t')
    return "".join(tok + " " for tok in kept)

# Tokenise document 1 line by line and persist the result to out1.txt.
for raw_line in inputs1:
    outputs1.write(seg_depart(raw_line) + '\n')
outputs1.close()
inputs1.close()
print("删除停用词和分词成功！！！")


# Tokenise document 2 line by line and persist the result to out2.txt.
for raw_line in inputs2:
    outputs2.write(seg_depart(raw_line) + '\n')
outputs2.close()
inputs2.close()

# Read the tokenised document 1 back in and split it into a token list.
# Fixed: the original filtered characters with `i is not '\n'`, an
# identity comparison against a str literal that only works via CPython
# string interning (and emits SyntaxWarning on modern Pythons); the
# manual readline loop is replaced by f.read() + str.replace.
with open("out1.txt") as f:
    s = f.read()
s2 = s.replace('\n', '')

# print(s2)
text1 = s2.split()
print(text1)

# Read the tokenised document 2 back in and split it into a token list.
# Fixed: same issues as for document 1 — `i is not '\n'` identity
# comparison replaced by str.replace, readline loop replaced by read().
with open("out2.txt") as f:
    c = f.read()
c2 = c.replace('\n', '')

# print(c2)
text2 = c2.split()
print(text2)

print("-----------------------------------创建词汇------------------------------------")
# Vocabulary = the unique tokens of both documents (order is arbitrary
# because it comes from a set, but both count vectors below share it).
vocabulary = list(set(text1 + text2))
print(vocabulary)
print("-----------------------------------创建文本的向量矩阵:start---------------------------------------")
# Term-count vector for document 1, aligned with `vocabulary`.
# Fixed idiom: the original `if text.count(t): append(count) else:
# append(0)` branch was redundant — count is already 0 for absent terms.
arr1 = [text1.count(t) for t in vocabulary]
print(arr1)
# Term-count vector for document 2.
arr2 = [text2.count(t) for t in vocabulary]
print(arr2)
print("-----------------------------创建文本的向量矩阵:end------------------------------------")
# print(len(vocabulary))
# print(len(arr1))
# print(len(arr2))
print("-----------------------------TF:start------------------------------------")
# Term frequency (TF).
def compute_tf(list_words):
    """Return the TF vector for a term-count vector.

    TF_i = count_i / total term occurrences in the document.

    Fixed: the original divided by len(list_words) — the vocabulary
    size — instead of the document's total term count. (Downstream
    cosine similarity is scale-invariant, so the final result is
    unaffected, but the printed TF values are now true frequencies.)
    An all-zero vector is returned as zeros to avoid division by zero.
    """
    total = sum(list_words)
    if total == 0:
        return [0.0] * len(list_words)
    return [count / total for count in list_words]

# TF vectors for both documents' count vectors.
arr1_tf = compute_tf(arr1)
arr2_tf = compute_tf(arr2)
print(arr1_tf)
print(arr2_tf)
print("-----------------------------TF:end------------------------------------")

print("-----------------------------IDF:start------------------------------------")
# Document frequency: for each vocabulary term, in how many of the two
# documents it appears (0, 1 or 2).
def count_words(text1, text2, vocab=None):
    """Return, per vocabulary term, the number of documents containing it.

    Fixed bug: the original only tested membership in text2 *inside*
    the text1 branch, so a term present only in the second document was
    counted as appearing in zero documents.

    vocab defaults to the module-level `vocabulary` for backward
    compatibility with the existing two-argument calls.
    """
    if vocab is None:
        vocab = vocabulary
    counts = []
    for term in vocab:
        df = 0
        if term in text1:
            df += 1
        if term in text2:
            df += 1
        counts.append(df)
    return counts

# Document-frequency vectors for the two orderings of the texts.
c1 = count_words(text1, text2)
c2 = count_words(text2, text1)
print(c1)
print(c2)

# Inverse document frequency (IDF), smoothed: log(N / (df + 1)).
def file_idf(c1, total_docs=None):
    """Return the IDF vector for a document-frequency vector.

    The +1 in the denominator avoids division by zero for terms that
    appear in no document. total_docs defaults to the module-level
    `text_sum` (corpus size) for backward compatibility; exposing it as
    a parameter removes the hard dependency on that global.
    """
    if total_docs is None:
        total_docs = text_sum
    return [math.log(total_docs / (df + 1)) for df in c1]

# IDF vectors for both document-frequency vectors.
arr1_idf = file_idf(c1)
arr2_idf = file_idf(c2)
print(arr1_idf)
print(arr2_idf)
print("-----------------------------IDF:end------------------------------------")

print("---------------------------------计算TF-IDF的向量矩阵:start-----------------------------------------")
# print(arr1_tf)
# print(arr1_idf)
# TF-IDF vector: element-wise product of the TF and IDF vectors.
def tf_idf(arr_tf, arr_idf):
    """Return the element-wise TF*IDF vector.

    Fixed bug: the original used *nested* loops, producing the full
    cross product of len(arr_tf) * len(arr_idf) values instead of
    pairing each term's TF with that same term's IDF.
    """
    return [tf * idf for tf, idf in zip(arr_tf, arr_idf)]

# TF-IDF vectors for both documents.
arr1_tfidf = tf_idf(arr1_tf, arr1_idf)
arr2_tfidf = tf_idf(arr2_tf, arr2_idf)
print(arr1_tfidf)
print(arr2_tfidf)
print("---------------------------------计算TF-IDF的向量矩阵:end-----------------------------------------")

print("----------------------------余弦相似度--------------------------------")
# Cosine similarity of two equal-length vectors.
def cosine_similarity(x, y, norm=False):
    """Return cos(x, y); with norm=True, rescale from [-1, 1] to [0, 1].

    Two zero vectors are defined as perfectly similar (1.0); a zero
    vector against a non-zero one yields 0.0.
    """
    assert len(x) == len(y), "len(x) != len(y)"
    zero_list = [0] * len(x)
    if x == zero_list or y == zero_list:
        return float(1) if x == y else float(0)
    dot = sum(a * b for a, b in zip(x, y))
    mag_x = np.sqrt(sum(a * a for a in x))
    mag_y = np.sqrt(sum(b * b for b in y))
    cos = dot / (mag_x * mag_y)
    return 0.5 * cos + 0.5 if norm else cos

# Final result: cosine similarity of the two documents' TF-IDF vectors.
similarity = cosine_similarity(arr1_tfidf, arr2_tfidf, norm=False)
print("这两篇文档的相似度为：{:%}".format(similarity))
print(similarity)

